# Data Manipulation Modules
import pandas as pd
import numpy as np
import scipy as sp
import numpy as np
import math
import scipy.stats as stats
from collections import Counter
# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as ms
# Managing Warnings
import warnings
warnings.filterwarnings('ignore')
# Load the modelling output used for the rest of the analysis.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
Final = pd.read_pickle("../data/Final_5.pkl")

# Take an explicit copy of the columns used below so that adding 'Sep_Diff'
# writes to an independent frame instead of a column-slice of `Final`
# (the original triggered a SettingWithCopyWarning hidden by the warnings filter).
stats_columns = ['Season', 'Receiver', 'Nearest_Defender', 'Separation', 'Pred_Sep',
                 'TD', 'Pass Yds', 'INT', 'Cmp', 'Team', 'Air_Yds', 'Cushion']
Stats = Final[stats_columns].copy()

# Separation differential = actual separation minus predicted separation.
Stats['Sep_Diff'] = Stats['Separation'] - Stats['Pred_Sep']

# 'Targets' per receiver-season is the number of distinct Separation values.
# Only the Separation column is counted; the other columns were discarded anyway.
# NOTE(review): nunique collapses repeated Separation values within a group --
# confirm this matches the intended target count.
COUNT_df = Stats.groupby(['Receiver', 'Season'])[['Separation']].nunique()
COUNT_df = COUNT_df.rename(columns={'Separation': 'Targets'})

# Attach the receiver-season target count to every row of Stats.
Final_df = Stats.merge(COUNT_df, on=['Receiver', 'Season'])
Below is a table showing how correlated a receiver's average Separation Differential is year-over-year. It turns out that, among receivers with at least 70 targets in each of the last 3 years, there is a low but non-zero correlation between those figures over three seasons. Considering the variable nature of separation, this is to be expected, but it would be worth looking into why these correlation values aren't more stable year over year.
# Year-over-year correlation of each receiver's mean Sep_Diff.
# Keep only rows belonging to receiver-seasons with at least 70 targets.
qualified_rows = Final_df.loc[Final_df['Targets'] >= 70]

# Mean of each separation metric per receiver-season.
season_means = (
    qualified_rows
    .groupby(['Receiver', 'Season'])[['Separation', 'Pred_Sep', 'Sep_Diff']]
    .mean()
)

# One row per receiver, one column per season, then correlate the season columns.
sep_diff_by_season = pd.pivot_table(
    season_means, values='Sep_Diff', index='Receiver', columns=['Season']
)
YoY = sep_diff_by_season.corr().round(3)
YoY
| Season | 2017 | 2018 | 2019 |
|---|---|---|---|
| Season | |||
| 2017 | 1.000 | 0.338 | 0.29 |
| 2018 | 0.338 | 1.000 | 0.45 |
| 2019 | 0.290 | 0.450 | 1.00 |
# Per receiver-season averages of the separation metrics (rounded to 2 decimals).
season_columns = ['Receiver', 'Separation', 'Pred_Sep', 'Sep_Diff', 'Season', 'Targets']
Season = (
    Final_df[season_columns]
    .groupby(['Receiver', 'Season'])
    .mean()
    .reset_index()
    .round(2)
)
# Build the multi-season ("since 2017") table: one row per receiver.
Since17 = Stats[['Receiver', 'Separation', 'Pred_Sep', 'Sep_Diff']]
receiver_means = Since17.groupby(['Receiver']).mean()
Since17_df = receiver_means.reset_index().round(2)

# Distinct-value counts per receiver; the Separation count is relabelled 'Targets'.
COUNT = Stats.groupby(['Receiver']).nunique().rename(columns={'Separation': 'Targets'})
COUNT_1 = COUNT[['Targets']]
Since17_df = Since17_df.merge(COUNT_1, on=['Receiver'])

# Receivers with more than 175 targets, best separation differential first.
over_175_targets = Since17_df['Targets'] > 175
Career = (
    Since17_df[over_175_targets]
    .sort_values(['Sep_Diff'], ascending=False)
    .reset_index(drop=True)
)
Career.head(10)
| | Receiver | Separation | Pred_Sep | Sep_Diff | Targets |
|---|---|---|---|---|---|
| 0 | Sammy Watkins | 2.96 | 2.55 | 0.42 | 253 |
| 1 | Tyreek Hill | 3.26 | 2.87 | 0.38 | 373 |
| 2 | Davante Adams | 3.04 | 2.69 | 0.35 | 432 |
| 3 | Ted Ginn | 3.02 | 2.73 | 0.29 | 188 |
| 4 | Cooper Kupp | 3.34 | 3.08 | 0.26 | 296 |
| 5 | Calvin Ridley | 3.02 | 2.78 | 0.24 | 184 |
| 6 | Amari Cooper | 2.76 | 2.55 | 0.22 | 336 |
| 7 | Tyler Lockett | 3.11 | 2.90 | 0.22 | 270 |
| 8 | Robby Anderson | 2.68 | 2.49 | 0.19 | 299 |
| 9 | Corey Davis | 2.87 | 2.68 | 0.19 | 271 |
# Ten lowest separation differentials in the Career table.
(
    Career
    .sort_values(by='Sep_Diff')
    .reset_index(drop=True)
    .head(10)
)
| | Receiver | Separation | Pred_Sep | Sep_Diff | Targets |
|---|---|---|---|---|---|
| 0 | Golden Tate | 2.66 | 3.17 | -0.51 | 324 |
| 1 | Larry Fitzgerald | 2.60 | 3.00 | -0.40 | 378 |
| 2 | Rob Gronkowski | 2.39 | 2.74 | -0.35 | 209 |
| 3 | Allen Robinson | 2.14 | 2.45 | -0.31 | 260 |
| 4 | Mike Williams | 1.95 | 2.20 | -0.25 | 194 |
| 5 | Jared Cook | 2.75 | 3.00 | -0.25 | 254 |
| 6 | DeVante Parker | 2.22 | 2.46 | -0.24 | 267 |
| 7 | Kenny Golladay | 2.12 | 2.35 | -0.23 | 273 |
| 8 | Marvin Jones | 2.14 | 2.31 | -0.17 | 258 |
| 9 | Demaryius Thomas | 2.54 | 2.69 | -0.15 | 281 |
# 2019 receiver-seasons with more than 85 targets, best separation differential first.
over_85_targets = Season['Targets'] > 85
in_2019 = Season['Season'] == 2019
df_2019 = (
    Season[over_85_targets & in_2019]
    .sort_values(by='Sep_Diff', ascending=False)
    .reset_index(drop=True)
)
df_2019.head(10)
| | Receiver | Season | Separation | Pred_Sep | Sep_Diff | Targets |
|---|---|---|---|---|---|---|
| 0 | Davante Adams | 2019 | 3.31 | 2.75 | 0.55 | 149 |
| 1 | Diontae Johnson | 2019 | 3.62 | 3.10 | 0.52 | 92 |
| 2 | Tyler Higbee | 2019 | 3.70 | 3.24 | 0.46 | 88 |
| 3 | Tyreek Hill | 2019 | 3.23 | 2.77 | 0.46 | 114 |
| 4 | Christian Kirk | 2019 | 3.48 | 3.07 | 0.41 | 106 |
| 5 | Travis Kelce | 2019 | 3.02 | 2.68 | 0.34 | 157 |
| 6 | Sammy Watkins | 2019 | 3.03 | 2.76 | 0.28 | 108 |
| 7 | John Brown | 2019 | 2.47 | 2.24 | 0.23 | 123 |
| 8 | Tyler Lockett | 2019 | 3.13 | 2.90 | 0.23 | 126 |
| 9 | Cooper Kupp | 2019 | 3.39 | 3.17 | 0.21 | 134 |
# Ten lowest separation differentials in the 2019 table.
(
    df_2019
    .sort_values(by='Sep_Diff')
    .reset_index(drop=True)
    .head(10)
)
| | Receiver | Season | Separation | Pred_Sep | Sep_Diff | Targets |
|---|---|---|---|---|---|---|
| 0 | Marvin Jones | 2019 | 2.09 | 2.55 | -0.47 | 91 |
| 1 | Mike Gesicki | 2019 | 2.62 | 3.03 | -0.41 | 89 |
| 2 | Julio Jones | 2019 | 2.16 | 2.48 | -0.31 | 156 |
| 3 | Kenny Golladay | 2019 | 1.95 | 2.22 | -0.27 | 113 |
| 4 | DeVante Parker | 2019 | 2.03 | 2.29 | -0.26 | 126 |
| 5 | Allen Robinson | 2019 | 2.26 | 2.52 | -0.26 | 152 |
| 6 | Terry McLaurin | 2019 | 2.10 | 2.35 | -0.25 | 92 |
| 7 | Larry Fitzgerald | 2019 | 2.98 | 3.19 | -0.22 | 109 |
| 8 | Jamison Crowder | 2019 | 3.01 | 3.21 | -0.20 | 121 |
| 9 | Zach Ertz | 2019 | 2.73 | 2.92 | -0.19 | 138 |
'Exceptional Separation Rate' tallies all the instances where a receiver exceeds their predicted separation and divides that by their total targets, showing (as a percentage) how often they exceed expectations. This metric will be less prone to outliers.
# Exceptional Separation (ES) rate: share of a receiver's rows where actual
# separation beat the prediction, expressed as a percentage.
Final_ES = Stats.copy()

# Flag rows with a positive separation differential (1) versus the rest (0).
# Assigning the whole column replaces the original chained
# `Final_ES['ES'].loc[mask] = 1`, which does not update the frame when pandas
# Copy-on-Write is enabled.
Final_ES['ES'] = (Final_ES['Sep_Diff'] > 0).astype(int)

# Attach per-receiver target counts, then average every metric per receiver.
Final_ES = Final_ES.merge(COUNT_1, on=['Receiver'])
Final_ES = Final_ES[['Receiver', 'ES', 'Targets', 'Sep_Diff', 'Separation', 'Pred_Sep',
                     'Pass Yds', 'Air_Yds', 'Cmp', 'Cushion']]
Final_ES = Final_ES.groupby(['Receiver']).mean().round(3)

# The mean of the 0/1 flag is a fraction; convert it to a percentage.
Final_ES['ES'] = (Final_ES['ES'] * 100).round(3)

# Keep receivers with at least 130 targets, highest ES rate first.
Final_ES = Final_ES[Final_ES['Targets'] >= 130]
Final_ES = Final_ES.sort_values(['ES'], ascending=False).reset_index()
Final_ES.head(10)
| | Receiver | ES | Targets | Sep_Diff | Separation | Pred_Sep | Pass Yds | Air_Yds | Cmp | Cushion |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Demarcus Robinson | 57.9 | 133.0 | 0.938 | 3.611 | 2.673 | 7.940 | 11.776 | 0.617 | 6.060 |
| 1 | Albert Wilson | 53.5 | 159.0 | 0.338 | 3.729 | 3.390 | 8.245 | 5.989 | 0.692 | 6.619 |
| 2 | Ted Ginn | 53.2 | 188.0 | 0.287 | 3.016 | 2.729 | 9.170 | 14.713 | 0.633 | 7.006 |
| 3 | Davante Adams | 52.4 | 432.0 | 0.349 | 3.037 | 2.688 | 8.236 | 10.370 | 0.658 | 5.295 |
| 4 | Sammy Watkins | 52.2 | 253.0 | 0.416 | 2.964 | 2.548 | 8.980 | 11.251 | 0.617 | 5.233 |
| 5 | Tyreek Hill | 51.2 | 373.0 | 0.383 | 3.258 | 2.874 | 10.211 | 13.119 | 0.651 | 6.184 |
| 6 | DeSean Jackson | 49.7 | 172.0 | 0.257 | 2.767 | 2.509 | 9.249 | 17.222 | 0.572 | 7.275 |
| 7 | Calvin Ridley | 49.5 | 184.0 | 0.237 | 3.021 | 2.783 | 9.109 | 11.823 | 0.685 | 6.856 |
| 8 | Christian Kirk | 49.4 | 174.0 | 0.119 | 3.180 | 3.061 | 7.466 | 9.279 | 0.638 | 6.078 |
| 9 | Cooper Kupp | 49.3 | 296.0 | 0.260 | 3.343 | 3.082 | 9.047 | 8.171 | 0.689 | 6.755 |
# Ten lowest Exceptional Separation rates (original row labels are kept).
(
    Final_ES
    .sort_values(by='ES')
    .head(10)
)
| | Receiver | ES | Targets | Sep_Diff | Separation | Pred_Sep | Pass Yds | Air_Yds | Cmp | Cushion |
|---|---|---|---|---|---|---|---|---|---|---|
| 110 | Kelvin Benjamin | 24.7 | 145.0 | -0.403 | 1.879 | 2.282 | 7.404 | 13.430 | 0.507 | 5.424 |
| 109 | Rob Gronkowski | 30.7 | 209.0 | -0.345 | 2.390 | 2.736 | 9.712 | 11.543 | 0.642 | 5.021 |
| 108 | Larry Fitzgerald | 30.8 | 378.0 | -0.400 | 2.596 | 2.996 | 7.089 | 8.378 | 0.666 | 6.139 |
| 107 | Kenny Golladay | 31.1 | 273.0 | -0.226 | 2.121 | 2.347 | 9.746 | 14.152 | 0.579 | 5.183 |
| 105 | Devin Funchess | 31.7 | 201.0 | -0.102 | 2.381 | 2.483 | 7.426 | 12.982 | 0.564 | 5.849 |
| 106 | Golden Tate | 31.7 | 324.0 | -0.508 | 2.664 | 3.173 | 7.631 | 7.098 | 0.671 | 5.980 |
| 104 | Delanie Walker | 32.1 | 162.0 | -0.398 | 2.827 | 3.225 | 7.389 | 8.817 | 0.667 | 6.600 |
| 103 | DeVante Parker | 32.6 | 267.0 | -0.244 | 2.216 | 2.460 | 8.030 | 12.958 | 0.563 | 5.110 |
| 102 | Mike Williams | 32.8 | 194.0 | -0.252 | 1.951 | 2.204 | 9.590 | 15.794 | 0.564 | 5.024 |
| 101 | Josh Reynolds | 33.1 | 135.0 | -0.169 | 2.614 | 2.783 | 6.801 | 11.375 | 0.500 | 6.701 |
The goal of this exercise is to try to extract information from receiver separation, and the analysis doesn't end with these numbers. In an attempt to see how this stat looks on the team scale, I correlated each team's win percentage over the last three seasons with its average separation differential to see how well this stat predicts team success.
# NOTE(review): WinPct is not referenced anywhere in the visible code; it is kept
# in case another cell uses it. Presumably one value per team -- confirm the
# ordering before relying on it.
WinPct = [.813, .750, .438, .563, .625, .438, .625, .375, .313, .875, .438, .750,
          .688, .344, .500, .375, .500, .625, .438, .563, .438, .563, .500, .313,
          .813, .438, .188, .125, .813, .250, .313, .219]

# Win percentage paired position-by-position with `team_abbreviations` below.
WP_AS = [.704, .551, .510, .490, .746, .451, .500, .560, .375, .673, .625, .354,
         .660, .608, .440, .648, .277, .736, .540, .375, .566, .354, .647, .469,
         .333, .500, .312, .354, .250, .510, .340, .383]

# The list gets its own name so `Teams` is no longer bound first to a list and
# then to a DataFrame.
team_abbreviations = ['KC', 'GB', 'HOU', 'SF', 'NE', 'JAX', 'ATL', 'DAL', 'DEN', 'LA', 'PIT', 'TB', 'BAL',
                      'SEA', 'IND', 'PHI', 'CLE', 'NO', 'LAC', 'MIA', 'TEN', 'OAK', 'MIN', 'CAR', 'NYJ',
                      'BUF', 'CIN', 'WAS', 'NYG', 'CHI', 'ARI', 'DET']
d = {'Team': team_abbreviations, 'Win Pct Since 2017': WP_AS}
Three_Year = pd.DataFrame(d)

# Copy the slice before adding a column so the assignment targets an independent
# frame (the original triggered a SettingWithCopyWarning hidden by the warnings filter).
Teams = Stats[['Separation', 'Pred_Sep', 'Team']].copy()
Teams['Sep Above Expectation'] = Teams['Separation'] - Teams['Pred_Sep']

# Team-level averages, best separation above expectation first, joined to win pct.
Team_df = Teams.groupby(['Team']).mean().sort_values(['Sep Above Expectation'], ascending=False)
Combined = Team_df.merge(Three_Year, on='Team')
Combined.round(3).head(32)
| | Team | Separation | Pred_Sep | Sep Above Expectation | Win Pct Since 2017 |
|---|---|---|---|---|---|
| 0 | KC | 3.281 | 2.911 | 0.370 | 0.704 |
| 1 | GB | 3.200 | 2.963 | 0.236 | 0.551 |
| 2 | LA | 3.142 | 3.023 | 0.119 | 0.673 |
| 3 | NE | 2.879 | 2.799 | 0.079 | 0.746 |
| 4 | HOU | 2.902 | 2.823 | 0.079 | 0.510 |
| 5 | SEA | 2.930 | 2.855 | 0.075 | 0.608 |
| 6 | IND | 3.011 | 2.945 | 0.066 | 0.440 |
| 7 | PIT | 2.948 | 2.905 | 0.043 | 0.625 |
| 8 | DAL | 2.812 | 2.769 | 0.042 | 0.560 |
| 9 | JAX | 2.974 | 2.944 | 0.030 | 0.451 |
| 10 | ATL | 2.954 | 2.925 | 0.028 | 0.500 |
| 11 | SF | 2.991 | 2.976 | 0.015 | 0.490 |
| 12 | OAK | 2.998 | 2.983 | 0.014 | 0.354 |
| 13 | BAL | 2.950 | 2.944 | 0.005 | 0.660 |
| 14 | NO | 2.917 | 2.919 | -0.002 | 0.736 |
| 15 | PHI | 2.884 | 2.890 | -0.006 | 0.648 |
| 16 | MIN | 2.897 | 2.913 | -0.016 | 0.647 |
| 17 | DEN | 2.817 | 2.839 | -0.022 | 0.375 |
| 18 | CLE | 2.748 | 2.780 | -0.032 | 0.277 |
| 19 | LAC | 2.733 | 2.774 | -0.041 | 0.540 |
| 20 | TEN | 2.869 | 2.910 | -0.041 | 0.566 |
| 21 | CAR | 2.842 | 2.886 | -0.044 | 0.469 |
| 22 | BUF | 2.765 | 2.819 | -0.054 | 0.500 |
| 23 | NYJ | 2.908 | 2.962 | -0.054 | 0.333 |
| 24 | CHI | 2.862 | 2.921 | -0.059 | 0.510 |
| 25 | MIA | 2.804 | 2.869 | -0.065 | 0.375 |
| 26 | NYG | 2.799 | 2.867 | -0.068 | 0.250 |
| 27 | TB | 2.672 | 2.747 | -0.075 | 0.354 |
| 28 | WAS | 2.848 | 2.955 | -0.107 | 0.354 |
| 29 | ARI | 2.786 | 2.936 | -0.150 | 0.340 |
| 30 | CIN | 2.630 | 2.791 | -0.161 | 0.312 |
| 31 | DET | 2.685 | 2.889 | -0.204 | 0.383 |
# Team-level scatter with fitted regression line: separation above expectation vs. win pct.
fig, ax = plt.subplots(figsize=(12, 8))

# regplot is called without `ax`, so it draws on the current axes; per the seaborn
# docs it returns that Axes, so from here on `fig` refers to an Axes and
# `fig.text` positions labels in data coordinates.
fig = sns.regplot(x='Sep Above Expectation', y='Win Pct Since 2017', data=Combined)

ax.set_title('Receiver Separation Above Expectation vs. Team Win Pct (Since 2017)', fontsize=17)
ax.set_ylabel('Team Winning Pct', fontsize=15)

# Hand-placed team labels: (x, y, text, horizontal alignment).
team_labels = [
    (.36, .66, "KC", 'left'),
    (-.2, .41, "DET", 'right'),
    (.24, .52, "GB", 'center'),
    (-.07, .215, "NYG", 'center'),
    (.07, .76, "NE", 'center'),
    (-.16, .28, "CIN", 'center'),
]
for label_x, label_y, label_text, label_align in team_labels:
    fig.text(label_x, label_y, label_text, horizontalalignment=label_align,
             size='large', color='black', weight='semibold')

# Assigning the return value keeps the notebook from echoing the Text object.
A = ax.set_xlabel('Avg Separation Above Expectation (yards/target)', fontsize=15)
Amazingly enough, teams with higher average separation differentials won more often than teams with lower separation differentials.
I know what you are thinking:
"Of course they did! Receiver separation is a good thing, so teams with higher separation win more!"
Yes, that is true, BUT, separation differential is actually a better predictor of team success than average separation.
# Spearman rank correlation between the team-level metrics.
# `Combined` also carries the string 'Team' column; restrict to numeric columns
# explicitly, because DataFrame.corr raises on non-numeric data in pandas >= 2.0
# (older versions dropped such columns silently).
Corr = Combined.select_dtypes(include='number').corr(method='spearman').round(2)
Corr
| | Separation | Pred_Sep | Sep Above Expectation | Win Pct Since 2017 |
|---|---|---|---|---|
| Separation | 1.00 | 0.73 | 0.79 | 0.48 |
| Pred_Sep | 0.73 | 1.00 | 0.23 | 0.07 |
| Sep Above Expectation | 0.79 | 0.23 | 1.00 | 0.66 |
| Win Pct Since 2017 | 0.48 | 0.07 | 0.66 | 1.00 |
import plotly.express as px
# Interactive scatter of predicted vs. actual average separation per receiver.
# Final_ES was filtered at >= 130 targets, but the title below states a
# 175-target minimum; apply that minimum here so the plot matches its title.
scatter_data = Final_ES[Final_ES['Targets'] >= 175]

fig = px.scatter(
    scatter_data,
    x="Pred_Sep",
    y="Separation",
    size="Targets",
    hover_name="Receiver",
    width=1000,
    height=600,
    labels={
        "Pred_Sep": 'Predicted Separation',
        "Pass Yds": "Rec Yards Per Target",
        "Cmp": "Catch Rate",
        'Air_Yds': 'Average Air Yards Per Target',
    },
    title="Predicted Separation vs Actual Separation (min. 175 targets since 2017) ",
)

# Explicit axis titles and a slightly larger font.
fig.update_layout(
    xaxis_title="Avg Predicted Separation (yds)",
    yaxis_title="Avg Separation (yds)",
    font=dict(size=13),
)

# Dashed y = x reference line from (2, 2) to (3.5, 3.5): points above it have
# actual separation greater than predicted.
fig.add_shape(
    type="line",
    x0=2,
    y0=2,
    x1=3.5,
    y1=3.5,
    line=dict(
        color="Black",
        width=1,
        dash="dash",
    ),
)
fig.show()
The interactive plot above shows the differences between predicted and actual average separations of receivers with at least 175 targets in the last three seasons.
Players above the diagonal line excel at gaining separation relative to their peers.
This could be a separate clustering project altogether, but you will notice 4 distinct groups of players as you travel up and to the right along the line.
# Receiver-level scatter with fitted regression line: average air yards vs. average separation.
fig, ax = plt.subplots(figsize=(12, 8))

# regplot is called without `ax`, so it draws on the current axes; per the seaborn
# docs it returns that Axes, so `fig` is an Axes from here on.
fig = sns.regplot(x='Air_Yds', y='Separation', data=Final_ES)

ax.set_title('Average Depth of Target vs. Average Separation (Since 2017)', fontsize=17)
ax.set_ylabel('Average Separation (Yards)', fontsize=15)

# Hand-placed player callouts in data coordinates: (x, y, text).
player_labels = [
    (17.5, 2.83, "DeSean Jackson"),
    (7.5, 2.73, "Golden Tate"),
]
for label_x, label_y, label_text in player_labels:
    fig.text(label_x, label_y, label_text, horizontalalignment='right',
             size='x-large', color='black', weight='medium')

# Assigning the return value keeps the notebook from echoing the Text object.
A = ax.set_xlabel('Average Depth of Target (Air Yards)', fontsize=15)
# Receiver-level scatter with fitted regression line: predicted vs. actual average
# separation, with two hand-annotated examples.
fig, ax = plt.subplots(figsize=(12, 8))

# regplot is called without `ax`, so it draws on the current axes; per the seaborn
# docs it returns that Axes, so `fig` is an Axes from here on.
fig = sns.regplot(x='Pred_Sep', y='Separation', data=Final_ES)

ax.set_title('Separation Above Expectation (Since 2017)', fontsize=17)
ax.set_ylabel('Average Separation (Yards)', fontsize=15)

# Player names and their separation differentials, placed in data coordinates.
fig.text(2.56, 2.81, "DeSean Jackson", horizontalalignment='right', size='large', color='black', weight='medium')
fig.text(2.29, 2.61, "+0.26 yds", size='xx-large', color='green', weight='medium')
fig.text(3.25, 2.58, "Golden Tate", horizontalalignment='right', size='large', color='black', weight='medium')
fig.text(3.21, 2.90, "-0.51 yds", size='xx-large', color='red', weight='medium')

# Vertical line segments for the two callouts: (x values, y values, colour).
# linewidth is now numeric; the original passed the string '2'.
callout_segments = [
    ([3.175, 3.175], [3.20, 2.67], 'red'),
    ([2.51, 2.51], [2.51, 2.75], 'green'),
]
for x_values, y_values, segment_color in callout_segments:
    plt.plot(x_values, y_values, color=segment_color, linewidth=2)

# Assigning the return value keeps the notebook from echoing the Text object.
A = ax.set_xlabel('Average Predicted Separation (Yards)', fontsize=15)
If I had more time/resources/data, what would I do to make these predictions better?
Thank you all for reading. If you have any questions, suggestions, or any feedback at all, feel free to reach out at JesseDCohen@gmail.com.